# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import time # for timing operations on large dataframes
import random
random.seed(42)
%matplotlib inline
Load and Preview Data
# Load a single year's dataset into a pandas dataframe and time the read.
start = time.time()
flights = pd.read_csv('2008.csv')  # flights_2008
end = time.time()
# Format the wall-clock time with one call: a single clock snapshot keeps the
# hour/minute/second fields consistent with each other and zero-pads them.
currTime = time.strftime('%H:%M:%S')
print('time elapsed: ' + str(end - start) + ', current time: ' + currTime)

# high-level overview of data shape and composition
print(flights.shape)
print(flights.dtypes)
Initial Observations:
# Backup the data
flights_orig = flights.copy(deep=True)  # version to leave unaltered
flights_all = flights.copy(deep=True)   # version to modify / use as necessary

# Sample (up to) 100000 flights, then display head to get a better sense of the data.
# The draw uses its own seeded NumPy generator: seeding Python's `random`
# module does not affect `np.random`, so without this the sample would change
# on every run.
sample_size = min(100000, flights_all.shape[0])
rng = np.random.RandomState(42)
samples = rng.choice(flights_all.shape[0], sample_size, replace=False)
# `samples` holds row positions, so select by position rather than by label;
# the explicit copy makes the sample safe to add columns to.
flights = flights_all.iloc[samples, :].copy()
flights.head(10)
Determine which columns are not shown above, then visualize them
# List all column names, then preview a chosen subset of them.
flights.columns
# Each column is listed once; repeated labels would make pandas return the
# same column several times in the preview.
cols_to_show = ['TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay',
                'DepDelay', 'Origin', 'Dest', 'Distance']
flights[cols_to_show].head(10)
flights.describe()
flights.info()
Define a function to plot multiple standard scale histograms
def plot_histograms_stdAxis(flights, var, xaxis_label, numPlots, min_vals, max_vals, binsizes):
    """Draw `numPlots` vertically stacked linear-scale histograms of one column.

    Subplot i bins `flights[var]` from `min_vals[i]` to `max_vals[i]` in steps
    of `binsizes[i]`, so the same variable can be viewed at several ranges and
    resolutions at once.

    Parameters:
        flights: dataframe holding the data to plot.
        var: name of the column to histogram.
        xaxis_label: x-axis label, applied to the bottom subplot only.
        numPlots: number of subplots; each list below needs at least this
            many entries.
        min_vals, max_vals, binsizes: per-subplot lower bound, upper bound and
            bin width.

    Returns:
        None. The figure is shown with plt.show().
    """
    # squeeze=False always yields a 2-D array of axes, so numPlots == 1 works
    # too (otherwise a bare Axes is returned and cannot be indexed).
    fig, ax = plt.subplots(nrows=numPlots, figsize=[8, numPlots * 3.5], squeeze=False)
    axes = ax[:, 0]
    for i in range(numPlots):
        # one extra step so that max_vals[i] itself falls inside the last bin
        bin_edges = np.arange(min_vals[i], max_vals[i] + binsizes[i], binsizes[i])
        axes[i].hist(data=flights, x=var, bins=bin_edges)
        title = (flights[var].name + '\n' +
                 'min: ' + str(min_vals[i]) +
                 ', max: ' + str(max_vals[i]) +
                 ', binsize: ' + str(binsizes[i]) + ' ')
        # negative pad places the title inside the plot area, top right
        axes[i].set_title(title, pad=-30, loc='right')
    axes[-1].set_xlabel(xaxis_label)
    plt.show()
# column dtypes of the sampled dataframe
flights.dtypes
Create standard scale histograms of the various 'Delay' parameters
# Seven views per delay column: two over the full observed range, then fixed
# windows at progressively finer bin sizes.
delay_vars = ['ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
              'SecurityDelay', 'LateAircraftDelay']
binsizes = [5, 2, 5, 2, 5, 2, 1]
numPlots = len(binsizes)  # one subplot per bin size
for var in delay_vars:
    observed_min = flights[var].min()
    observed_max = flights[var].max()
    min_values = [observed_min, observed_min, -50, -50, -50, -50, -50]
    max_values = [observed_max, observed_max, 300, 300, 150, 150, 150]
    plot_histograms_stdAxis(flights, var, 'Delay (min)', numPlots, min_values, max_values, binsizes)
    print('------------------------------------------------------------------------------------')
Zoom in on the 'ArrDelay' histogram to get a better sense of its distribution
# Two zoomed views of ArrDelay over [-50, 150] minutes, with 2- and 1-minute bins.
var = 'ArrDelay'
min_values = [-50, -50]
max_values = [150, 150]
binsizes = [2, 1]
numPlots = len(binsizes)  # one subplot per bin size
plot_histograms_stdAxis(flights, var, 'Delay (min)', numPlots, min_values, max_values, binsizes)
Repeat for 'DepDelay'
# Two zoomed views of DepDelay over [-25, 100] minutes, with 2- and 1-minute bins.
var = 'DepDelay'
min_values = [-25, -25]
max_values = [100, 100]
binsizes = [2, 1]
numPlots = len(binsizes)  # one subplot per bin size
plot_histograms_stdAxis(flights, var, 'Delay (min)', numPlots, min_values, max_values, binsizes)
'Delay' Variables - Initial Observations
Define a function to plot multiple log scale histograms
def plot_histograms_logAxis(flights, var, xaxis_label, numPlots, max_vals, log_binsizes, ticks):
    """Draw `numPlots` vertically stacked histograms of one column on a log x-axis.

    Subplot i uses bin edges 10**0, 10**log_binsizes[i], ... up to at least
    max_vals[i]. The first edge is always 1, so values below 1 are not counted.

    Parameters:
        flights: dataframe holding the data to plot.
        var: name of the column to histogram.
        xaxis_label: x-axis label, applied to the bottom subplot only.
        numPlots: number of subplots; each list below needs at least this
            many entries.
        max_vals: per-subplot upper bound, in data units.
        log_binsizes: per-subplot bin width, in log10 units.
        ticks: x tick positions (data units), used on every subplot.

    Returns:
        None. The figure is shown with plt.show().
    """
    # squeeze=False always yields a 2-D array of axes, so numPlots == 1 works
    # too (otherwise a bare Axes is returned and cannot be indexed).
    fig, ax = plt.subplots(nrows=numPlots, figsize=[8, numPlots * 3.5], squeeze=False)
    axes = ax[:, 0]
    labels = ['{}'.format(v) for v in ticks]
    for i in range(numPlots):
        # edges are evenly spaced in log10 space, starting at 10**0 = 1
        bin_edges = 10 ** np.arange(0, np.log10(max_vals[i]) + log_binsizes[i], log_binsizes[i])
        axes[i].hist(data=flights, x=var, bins=bin_edges)
        # switch to the log scale first, then override its default ticks
        axes[i].set_xscale('log')
        axes[i].xaxis.set_ticks(ticks)
        axes[i].xaxis.set_ticklabels(labels)
        title = (flights[var].name + '\n' +
                 'max: ' + str(max_vals[i]) +
                 ', log_binsize: ' + str(log_binsizes[i]) + ' ')
        # negative pad places the title inside the plot area, top right
        axes[i].set_title(title, pad=-30, loc='right')
    axes[-1].set_xlabel(xaxis_label)
    plt.show()
Create log-scale histograms of the 'Delay' variables that have a large number of points near zero
# Six log-scale views per column: two up to the observed maximum, then four
# capped at 300 minutes with progressively narrower log bins.
sparse_delay_vars = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
log_binsizes = [0.5, 0.3, 0.3, 0.25, 0.2, 0.15]
ticks = [1, 3, 10, 30, 100, 300, 1000]
numPlots = len(log_binsizes)  # one subplot per bin size
for var in sparse_delay_vars:
    observed_max = flights[var].max()
    max_values = [observed_max, observed_max, 300, 300, 300, 300]
    plot_histograms_logAxis(flights, var, 'Delay (min)', numPlots, max_values, log_binsizes, ticks)
    print('------------------------------------------------------------------------------------')
Create standard scale histograms of the variables that have a large number of points near zero
'CarrierDelay' distribution
# CarrierDelay from 1 minute upwards (zeros excluded), with 2- and 1-minute bins.
var = 'CarrierDelay'
min_values = [1, 1]
max_values = [150, 150]
binsizes = [2, 1]
numPlots = len(binsizes)  # one subplot per bin size
plot_histograms_stdAxis(flights, var, 'Delay (min)', numPlots, min_values, max_values, binsizes)
'WeatherDelay' distribution
# WeatherDelay from 1 minute upwards (zeros excluded), at four ranges / bin sizes.
var = 'WeatherDelay'
min_values = [1, 1, 1, 1]
max_values = [300, 250, 150, 150]
binsizes = [5, 3, 2, 1]
numPlots = len(binsizes)  # one subplot per bin size
plot_histograms_stdAxis(flights, var, 'Delay (min)', numPlots, min_values, max_values, binsizes)
'NASDelay' distribution
# NASDelay from 1 minute upwards (zeros excluded): up to 150, then up to 40 minutes.
var = 'NASDelay'
min_values = [1, 1]
max_values = [150, 40]
binsizes = [1, 1]
numPlots = len(binsizes)  # one subplot per bin size
plot_histograms_stdAxis(flights, var, 'Delay (min)', numPlots, min_values, max_values, binsizes)
'LateAircraftDelay' distribution
# LateAircraftDelay from 1 minute upwards (zeros excluded): up to 300, then up to 150 minutes.
var = 'LateAircraftDelay'
min_values = [1, 1]
max_values = [300, 150]
binsizes = [2, 2]
numPlots = len(binsizes)  # one subplot per bin size
plot_histograms_stdAxis(flights, var, 'Delay (min)', numPlots, min_values, max_values, binsizes)
Create log-scale histograms of 'ArrDelay' and 'DepDelay' variables
# Eight log-scale views of each overall delay: two up to the observed maximum,
# then six capped at 300 minutes with progressively narrower log bins.
overall_delay_vars = ['ArrDelay', 'DepDelay']
log_binsizes = [0.5, 0.3, 0.3, 0.25, 0.2, 0.15, 0.1, 0.05]
ticks = [1, 3, 10, 30, 100, 300, 1000]
numPlots = len(log_binsizes)  # one subplot per bin size
for var in overall_delay_vars:
    observed_max = flights[var].max()
    max_values = [observed_max, observed_max, 300, 300, 300, 300, 300, 300]
    plot_histograms_logAxis(flights, var, 'Delay (min)', numPlots, max_values, log_binsizes, ticks)
    print('------------------------------------------------------------------------------------')
# Four log-scale views of ArrDelay across the whole range, varying only the bin width.
var = 'ArrDelay'
log_binsizes = [0.15, 0.1, 0.08, 0.05]
numPlots = len(log_binsizes)  # one subplot per bin size
# NOTE(review): the upper limit comes from the full dataset while the plotted
# data is the sample - confirm this is intended.
max_values = [flights_all[var].max()] * numPlots
ticks = [10, 30, 100, 300, 1000, 3000]
plot_histograms_logAxis(flights, var, 'Delay (min)', numPlots, max_values, log_binsizes, ticks)
print('------------------------------------------------------------------------------------')
'Delay' Variables - Additional Observations
Select other variables for exploration
# column dtypes of the sampled dataframe
flights.dtypes
Numerical variables of potential interest (create distributions):
Categorical variables of potential interest (create bar plots):
# Three full-range histograms (bin widths 5, 2 and 1) for every remaining
# numerical variable of interest.
vars_distr = ['Month', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'ActualElapsedTime',
              'CRSElapsedTime', 'AirTime', 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'Diverted']
numPlots = 3
binsizes = [5, 2, 1]
for var in vars_distr:
    # every subplot spans the observed range of the column
    min_values = [flights[var].min()] * numPlots
    max_values = [flights[var].max()] * numPlots
    plot_histograms_stdAxis(flights, var, 'Value', numPlots, min_values, max_values, binsizes)
    print('------------------------------------------------------------------------------------')
Zoom in on 'CRSElapsedTime' distribution
# CRSElapsedTime: full range first, then capped at 200, then the 40-100 window.
var = 'CRSElapsedTime'
numPlots = 4
observed_min = flights[var].min()
min_values = [observed_min, observed_min, observed_min, 40]
max_values = [flights[var].max(), 200, 200, 100]
binsizes = [5, 2, 1, 1]
plot_histograms_stdAxis(flights, var, 'Value', numPlots, min_values, max_values, binsizes)
Zoom in on 'Distance' distribution
# Distance over its full observed range at four bin widths.
var = 'Distance'
numPlots = 4
min_values = [flights[var].min()] * numPlots
max_values = [flights[var].max()] * numPlots
binsizes = [100, 50, 30, 10]
plot_histograms_stdAxis(flights, var, 'Value', numPlots, min_values, max_values, binsizes)
Add 'Date', 'DayOfYear', and 'Week' parameters
# Add Date, Week, and DayOfYear parameters to the sample and also to the
# larger dataset (in case it is needed later).
for frame in (flights, flights_all):
    # Year/Month/DayofMonth are combined into one YYYYMMDD integer for parsing
    frame['Date'] = pd.to_datetime(frame.Year * 10000 +
                                   frame.Month * 100 +
                                   frame.DayofMonth, format='%Y%m%d')
    frame['DayOfYear'] = frame['Date'].dt.dayofyear
    # Series.dt.week was removed in pandas 2.0; isocalendar().week is its
    # replacement. Cast to a plain integer dtype so later arithmetic and
    # grouping behave as they did with the old accessor.
    frame['Week'] = frame['Date'].dt.isocalendar().week.astype(np.int64)
    # quick hack / correct issue where the last few days of the year errantly
    # indicate 'Week' == 1 (ISO numbering assigns them to the next year)
    year_end = (frame.Month == 12) & (frame.DayofMonth >= 29)
    frame.loc[year_end, 'Week'] = 53
Plot the 'DayOfYear' distribution
# DayOfYear over its full range at bin widths 5, 2 and 1; the upper bound is
# pushed out by one.
var = 'DayOfYear'
numPlots = 3
min_values = [flights[var].min()] * numPlots
max_values = [flights[var].max() + 1] * numPlots
binsizes = [5, 2, 1]
plot_histograms_stdAxis(flights, var, 'Value', numPlots, min_values, max_values, binsizes)
Plot the 'Week' distribution
# Week over its full range at bin widths 2 and 1; the upper bound is pushed
# out by one.
var = 'Week'
numPlots = 2
min_values = [flights[var].min()] * numPlots
max_values = [flights[var].max() + 1] * numPlots
binsizes = [2, 1]
plot_histograms_stdAxis(flights, var, 'Value', numPlots, min_values, max_values, binsizes)
Plot counts of categorical variables - top 10 for 'Origin', 'Dest', and 'UniqueCarrier'
# Flight counts per origin, destination and carrier, most frequent first.
origins = flights.groupby('Origin').Origin.count().sort_values(ascending=False)
destinations = flights.groupby('Dest').Dest.count().sort_values(ascending=False)
carriers = flights.groupby('UniqueCarrier').UniqueCarrier.count().sort_values(ascending=False)

# number of distinct origins / destinations / carriers in the sample
print(origins.count())
print(destinations.count())
print(carriers.count())

origins_head = origins.head(10)
destinations_head = destinations.head(10)
carriers_head = carriers.head(10)

default_color = sb.color_palette()[0]
# One bar chart per category. Each chart plots a series against its own
# counts (the destination chart previously reused the origin counts), and
# x / y are passed by keyword as required by current seaborn releases.
for top_counts in (origins_head, destinations_head, carriers_head):
    plt.figure(figsize=[8, 3])
    ax = sb.barplot(x=top_counts.index, y=top_counts.values, color=default_color)

# top-10 labels reused by later cells
top10_origin = origins_head.index.tolist()
top10_dest = destinations_head.index.tolist()
top10_carrier = carriers_head.index.tolist()
print(top10_origin, '\n---------\n', top10_carrier)
Other Variables - Observations:
Observations (largely repeated from prior comments):
Most likely variables of interest:
The nature of several distributions is discussed in the section immediately preceding this question
I performed very little cleaning or tidying of the data.
I did add three new features: 'Date' 'Week' and 'DayOfYear'
Initial approach:
# choose variables to explore (named so as not to shadow the builtin `vars`)
corr_vars = ['ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay',
             'LateAircraftDelay', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'ActualElapsedTime',
             'CRSElapsedTime', 'AirTime', 'Distance', 'TaxiIn', 'TaxiOut', 'DayOfYear']

# correlation plot: pairwise correlation coefficients, diverging palette centred on 0
plt.figure(figsize=[16, 10])
sb.heatmap(flights[corr_vars].corr(), annot=True, fmt='.3f', cmap='vlag_r', center=0)
plt.show()
Remove 'SecurityDelay', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'Distance', and 'DayOfYear' from the heat map:
# choose variables to explore (removed: 'SecurityDelay', 'ActualElapsedTime',
# 'CRSElapsedTime', 'AirTime', 'Distance', 'DayOfYear'); named so as not to
# shadow the builtin `vars`
corr_vars = ['ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'LateAircraftDelay',
             'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'TaxiIn', 'TaxiOut']

# correlation plot: pairwise correlation coefficients, diverging palette centred on 0
plt.figure(figsize=[12, 8])
sb.heatmap(flights[corr_vars].corr(), annot=True, fmt='.3f', cmap='vlag_r', center=0)
plt.show()
Look at a heat map with just the delay variables, along with ['TaxiIn', 'TaxiOut'], to see if it makes more sense or if anything else sticks out
# choose variables to explore (removed: 'DepTime', 'CRSDepTime', 'ArrTime',
# 'CRSArrTime'); named so as not to shadow the builtin `vars`
corr_vars = ['ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'LateAircraftDelay',
             'TaxiIn', 'TaxiOut']

# correlation plot: pairwise correlation coefficients, diverging palette centred on 0
plt.figure(figsize=[8, 6])
sb.heatmap(flights[corr_vars].corr(), annot=True, fmt='.3f', cmap='vlag_r', center=0)
plt.show()
Recall from univariate exploration:
Based on this, it might be useful to look at correlations where those variables are not zero
Planned approach:
def filter_zeros(df, varCompare, varFilt):
    """Return the rows of `df` where `varFilt` is strictly positive.

    Only the two columns `varCompare` and `varFilt` (in that order) are kept.
    The input dataframe is left unmodified.
    """
    is_positive = df[varFilt] > 0
    return df.loc[is_positive, [varCompare, varFilt]]
# Correlation matrix in which any pairing with a 'filter' column is computed
# only over the rows where that column is > 0.
varsCorr = ['ArrDelay', 'DepDelay', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
            'LateAircraftDelay', 'TaxiIn', 'TaxiOut']
varsFilt = ['CarrierDelay', 'WeatherDelay', 'NASDelay', 'LateAircraftDelay']

# float dtype from the start, so no string/object -> float conversion is needed later
flights_varsCorr = pd.DataFrame(index=varsCorr, columns=varsCorr, dtype=float)

for row_var in varsCorr:
    for col_var in varsCorr:
        if row_var == col_var:
            # case: variable being assessed against itself
            current_corr = 1
        elif col_var in varsFilt:
            # case: 'comparison' variable being assessed against a 'filter'
            # variable - use only rows where the filter variable is > 0
            df_twoVars = filter_zeros(flights, row_var, col_var)
            current_corr = df_twoVars[row_var].corr(df_twoVars[col_var])
        else:
            # base case: the column variable is not a 'filter' variable
            current_corr = flights[row_var].corr(flights[col_var])
        # One .loc[row, col] assignment: the chained form .loc[row][col] = ...
        # may write to a temporary copy and leave the frame unchanged.
        flights_varsCorr.loc[row_var, col_var] = current_corr

# x-axis labels flag which columns had their zero values filtered out
varsCorr_nonZero = ['ArrDelay', 'DepDelay', 'CarrierDelay_nonZero', 'WeatherDelay_nonZero', 'NASDelay_nonZero',
                    'LateACDelay_nonZero', 'TaxiIn', 'TaxiOut']

# correlation plot
plt.figure(figsize=[12, 6])
ax = sb.heatmap(flights_varsCorr, annot=True, fmt='.3f', annot_kws={"size": 14},
                cmap='vlag_r', center=0,
                xticklabels=varsCorr_nonZero)
# the colorbar is the last axes of the figure
ax.figure.axes[-1].tick_params(labelsize=14)
ax.tick_params(labelsize=14)
plt.xticks(rotation=30, ha='right')
title_string = ('Correlation Coefficients after removing \'zero\' values from select columns')
plt.suptitle(title_string, x=0.45, y=0.97, fontsize=18)
plt.show()
Observations:
Next, I am going to create some violin plots to assess how the distributions of ArrDelay and DepDelay vary based on a second variable:
print(top10_origin, '\n---------\n', top10_carrier)

default_color = sb.color_palette()[0]
# restrict the sample to the ten most frequent origins / destinations / carriers
flights_reduced_orig = flights[flights.Origin.isin(top10_origin)]
flights_reduced_dest = flights[flights.Dest.isin(top10_dest)]
flights_reduced_carrier = flights[flights.UniqueCarrier.isin(top10_carrier)]

# plot arrival delay distribution against origin, dest, and unique carrier
categoric_vars = ['Origin', 'Dest', 'UniqueCarrier']
# each categorical variable is drawn from its own reduced dataframe
reduced_by_var = {'Origin': flights_reduced_orig,
                  'Dest': flights_reduced_dest,
                  'UniqueCarrier': flights_reduced_carrier}
fig, ax = plt.subplots(ncols=1, nrows=len(categoric_vars), figsize=[6, 4 * len(categoric_vars)], squeeze=False)
for i, var in enumerate(categoric_vars):
    sb.violinplot(data=reduced_by_var[var], x=var, y='ArrDelay', ax=ax[i, 0], color=default_color)
    ax[i, 0].set(ylim=(-60, 100))
plt.show()
# plot departure delay distribution against origin, dest, and unique carrier
categoric_vars = ['Origin', 'Dest', 'UniqueCarrier']
# each categorical variable is drawn from its own reduced dataframe
reduced_by_var = {'Origin': flights_reduced_orig,
                  'Dest': flights_reduced_dest,
                  'UniqueCarrier': flights_reduced_carrier}
fig, ax = plt.subplots(ncols=1, nrows=len(categoric_vars), figsize=[6, 4 * len(categoric_vars)], squeeze=False)
for i, var in enumerate(categoric_vars):
    sb.violinplot(data=reduced_by_var[var], x=var, y='DepDelay', ax=ax[i, 0], color=default_color)
    ax[i, 0].set(ylim=(-30, 100))
plt.show()
Observations:
# plot arrival delay distribution against origin, dest, and unique carrier
# (horizontal violins: delay on the x-axis, category on the y-axis)
categoric_vars = ['Origin', 'Dest', 'UniqueCarrier']
# each categorical variable is drawn from its own reduced dataframe
reduced_by_var = {'Origin': flights_reduced_orig,
                  'Dest': flights_reduced_dest,
                  'UniqueCarrier': flights_reduced_carrier}
fig, ax = plt.subplots(ncols=1, nrows=len(categoric_vars), figsize=[6, 5 * len(categoric_vars)], squeeze=False)
for i, var in enumerate(categoric_vars):
    sb.violinplot(data=reduced_by_var[var], y=var, x='ArrDelay', ax=ax[i, 0], color=default_color)
    ax[i, 0].set(xlim=(-30, 40))
plt.show()
# plot departure delay distribution against origin, dest, and unique carrier
# (horizontal violins: delay on the x-axis, category on the y-axis)
categoric_vars = ['Origin', 'Dest', 'UniqueCarrier']
# each categorical variable is drawn from its own reduced dataframe
reduced_by_var = {'Origin': flights_reduced_orig,
                  'Dest': flights_reduced_dest,
                  'UniqueCarrier': flights_reduced_carrier}
fig, ax = plt.subplots(ncols=1, nrows=len(categoric_vars), figsize=[6, 5 * len(categoric_vars)], squeeze=False)
for i, var in enumerate(categoric_vars):
    sb.violinplot(data=reduced_by_var[var], y=var, x='DepDelay', ax=ax[i, 0], color=default_color)
    ax[i, 0].set(xlim=(-20, 40))
plt.show()
Observations:
# plot arrival delay distribution against origin, dest, and unique carrier
# as horizontal box plots (whiskers at 0.5 * IQR, no outlier markers)
categoric_vars = ['Origin', 'Dest', 'UniqueCarrier']
# each categorical variable is drawn from its own reduced dataframe
reduced_by_var = {'Origin': flights_reduced_orig,
                  'Dest': flights_reduced_dest,
                  'UniqueCarrier': flights_reduced_carrier}
fig, ax = plt.subplots(ncols=1, nrows=len(categoric_vars), figsize=[6, 5 * len(categoric_vars)], squeeze=False)
for i, var in enumerate(categoric_vars):
    sb.boxplot(data=reduced_by_var[var], y=var, x='ArrDelay', ax=ax[i, 0], color=default_color, whis=0.5, sym='')
    ax[i, 0].set(xlim=(-30, 40))
plt.show()
# plot departure delay distribution against origin, dest, and unique carrier
# as horizontal box plots (whiskers at 0.5 * IQR, no outlier markers)
categoric_vars = ['Origin', 'Dest', 'UniqueCarrier']
# each categorical variable is drawn from its own reduced dataframe
reduced_by_var = {'Origin': flights_reduced_orig,
                  'Dest': flights_reduced_dest,
                  'UniqueCarrier': flights_reduced_carrier}
fig, ax = plt.subplots(ncols=1, nrows=len(categoric_vars), figsize=[6, 5 * len(categoric_vars)], squeeze=False)
for i, var in enumerate(categoric_vars):
    sb.boxplot(data=reduced_by_var[var], y=var, x='DepDelay', ax=ax[i, 0], color=default_color, whis=0.5, sym='')
    ax[i, 0].set(xlim=(-20, 40))
plt.show()
(Top of Page)
Box Plot Observations:
# 2-D histogram of ArrDelay (0-300 min, 5-minute bins) against DayOfYear
# (1-day bins). hist2d returns a number of different variables, including an
# array of counts; cmin=1 leaves empty cells uncoloured.
bins_y = np.arange(0, 300 + 5, 5)
bins_x = np.arange(0, 366 + 1, 1)
h2d = plt.hist2d(data=flights, x='DayOfYear', y='ArrDelay',
                 bins=[bins_x, bins_y], cmap='viridis_r', cmin=1)
plt.xlabel('DayOfYear')
plt.ylabel('ArrDelay');
# 2-D histogram of ArrDelay (log-spaced bins) against DayOfYear (10-day bins).
x = flights['DayOfYear']
y = flights['ArrDelay']

x_min_val = 0
x_max_val = 366
x_binSize = 10
x_bin_edges = np.arange(x_min_val, x_max_val + x_binSize, x_binSize)

# y_min_val is a base-10 exponent: the first edge is 10**0 = 1 minute
y_min_val = 0
y_max_val = 300
y_logBinSize = 0.2
y_bin_edges = 10 ** np.arange(y_min_val, np.log10(y_max_val) + y_logBinSize, y_logBinSize)

counts, _, _ = np.histogram2d(x, y, bins=(x_bin_edges, y_bin_edges))

fig, ax = plt.subplots()
# histogram2d indexes counts as [x, y]; pcolormesh expects [y, x]
plt.pcolormesh(x_bin_edges, y_bin_edges, counts.T, cmap='viridis_r')
tick_locs = [1, 10, 100]
labels = ['{}'.format(v) for v in tick_locs]
plt.ylabel('ArrDelay')
plt.xlabel('DayOfYear')
# Switch to the log scale BEFORE setting ticks: changing the scale installs
# the scale's default locator/formatter and would discard custom ticks.
ax.set_yscale('log')
plt.yticks(tick_locs, labels, fontsize=14)
plt.colorbar()
plt.show()
Observations:
# 2-D histogram of DepDelay (log-spaced bins) against DayOfYear (10-day bins).
x = flights['DayOfYear']
y = flights['DepDelay']

x_min_val = 0
x_max_val = 366
x_binSize = 10
x_bin_edges = np.arange(x_min_val, x_max_val + x_binSize, x_binSize)

# y_min_val is a base-10 exponent: the first edge is 10**0 = 1 minute
y_min_val = 0
y_max_val = 300
y_logBinSize = 0.2
y_bin_edges = 10 ** np.arange(y_min_val, np.log10(y_max_val) + y_logBinSize, y_logBinSize)

counts, _, _ = np.histogram2d(x, y, bins=(x_bin_edges, y_bin_edges))

fig, ax = plt.subplots()
# histogram2d indexes counts as [x, y]; pcolormesh expects [y, x]
plt.pcolormesh(x_bin_edges, y_bin_edges, counts.T, cmap='viridis_r')
tick_locs = [1, 10, 100]
labels = ['{}'.format(v) for v in tick_locs]
plt.ylabel('DepDelay')
plt.xlabel('DayOfYear')
# Switch to the log scale BEFORE setting ticks: changing the scale installs
# the scale's default locator/formatter and would discard custom ticks.
ax.set_yscale('log')
plt.yticks(tick_locs, labels, fontsize=14)
plt.colorbar()
plt.show()
Observations:
# 2-D histogram of ArrDelay (log-spaced bins) against ArrTime (bins of width 120).
x = flights['ArrTime']
y = flights['ArrDelay']

x_min_val = 0
x_max_val = 2400
x_binSize = 120
x_bin_edges = np.arange(x_min_val, x_max_val + x_binSize, x_binSize)

# y_min_val is a base-10 exponent: the first edge is 10**0 = 1 minute
y_min_val = 0
y_max_val = 300
y_logBinSize = 0.2
y_bin_edges = 10 ** np.arange(y_min_val, np.log10(y_max_val) + y_logBinSize, y_logBinSize)

counts, _, _ = np.histogram2d(x, y, bins=(x_bin_edges, y_bin_edges))

fig, ax = plt.subplots()
# histogram2d indexes counts as [x, y]; pcolormesh expects [y, x]
plt.pcolormesh(x_bin_edges, y_bin_edges, counts.T, cmap='viridis_r')
tick_locs = [1, 10, 100]
labels = ['{}'.format(v) for v in tick_locs]
plt.ylabel('ArrDelay')
plt.xlabel('ArrTime')
# Switch to the log scale BEFORE setting ticks: changing the scale installs
# the scale's default locator/formatter and would discard custom ticks.
ax.set_yscale('log')
plt.yticks(tick_locs, labels, fontsize=14)
plt.colorbar()
plt.show()
Observations:
# 2-D histogram of DepDelay (log-spaced bins) against DepTime (bins of width 120).
x = flights['DepTime']
y = flights['DepDelay']

x_min_val = 0
x_max_val = 2400
x_binSize = 120
x_bin_edges = np.arange(x_min_val, x_max_val + x_binSize, x_binSize)

# y_min_val is a base-10 exponent: the first edge is 10**0 = 1 minute
y_min_val = 0
y_max_val = 300
y_logBinSize = 0.2
y_bin_edges = 10 ** np.arange(y_min_val, np.log10(y_max_val) + y_logBinSize, y_logBinSize)

counts, _, _ = np.histogram2d(x, y, bins=(x_bin_edges, y_bin_edges))

fig, ax = plt.subplots()
# histogram2d indexes counts as [x, y]; pcolormesh expects [y, x]
plt.pcolormesh(x_bin_edges, y_bin_edges, counts.T, cmap='viridis_r')
tick_locs = [1, 10, 100]
labels = ['{}'.format(v) for v in tick_locs]
plt.ylabel('DepDelay')
plt.xlabel('DepTime')
# Switch to the log scale BEFORE setting ticks: changing the scale installs
# the scale's default locator/formatter and would discard custom ticks.
ax.set_yscale('log')
plt.yticks(tick_locs, labels, fontsize=14)
plt.colorbar()
plt.show()
Observations:
# 2-D histogram of WeatherDelay (log-spaced bins) against DayOfYear (15-day
# bins), using the full dataset rather than the sample.
x = flights_all['DayOfYear']
y = flights_all['WeatherDelay']

x_min_val = 0
x_max_val = 366
x_binSize = 15
x_bin_edges = np.arange(x_min_val, x_max_val + x_binSize, x_binSize)

# y_min_val is a base-10 exponent: the first edge is 10**1 = 10 minutes, so
# delays below 10 minutes are not counted
y_min_val = 1
y_max_val = 300
y_logBinSize = 0.15
y_bin_edges = 10 ** np.arange(y_min_val, np.log10(y_max_val) + y_logBinSize, y_logBinSize)

counts, _, _ = np.histogram2d(x, y, bins=(x_bin_edges, y_bin_edges))

fig, ax = plt.subplots()
# histogram2d indexes counts as [x, y]; pcolormesh expects [y, x]
plt.pcolormesh(x_bin_edges, y_bin_edges, counts.T, cmap='viridis_r')
# ticks start at the first bin edge (10); a tick at 1 would lie below the data
tick_locs = [10, 100]
labels = ['{}'.format(v) for v in tick_locs]
plt.ylabel('WeatherDelay')
plt.xlabel('DayOfYear')
# Switch to the log scale BEFORE setting ticks: changing the scale installs
# the scale's default locator/formatter and would discard custom ticks.
ax.set_yscale('log')
plt.yticks(tick_locs, labels, fontsize=14)
plt.colorbar()
plt.show()
Observations:
Here is a more legible version of this particular chart (WeatherDelay 2D Histogram) for potential inclusion in the final report:
# Report-quality version of the WeatherDelay vs DayOfYear 2-D histogram:
# larger figure, finer log bins, 'Reds' palette and labelled colorbar.
x = flights_all['DayOfYear']
y = flights_all['WeatherDelay']

# x: 15-day bins across the year
x_min_val, x_max_val, x_binSize = 0, 366, 15
x_bin_edges = np.arange(x_min_val, x_max_val + x_binSize, x_binSize)

# y: log-spaced bins; y_min_val is a base-10 exponent (first edge 10**1 = 10)
y_min_val, y_max_val, y_logBinSize = 1, 300, 0.09
y_bin_edges = 10 ** np.arange(y_min_val, np.log10(y_max_val) + y_logBinSize, y_logBinSize)

counts, _, _ = np.histogram2d(x, y, bins=(x_bin_edges, y_bin_edges))

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=[9, 5])
# histogram2d indexes counts as [x, y]; pcolormesh expects [y, x]
plt.pcolormesh(x_bin_edges, y_bin_edges, counts.T, cmap='Reds')
tick_locs = [10, 30, 100, 300]
labels = ['{}'.format(v) for v in tick_locs]
plt.ylabel('Weather Delay (min)', fontsize=16)
plt.xlabel('Day Of Year', fontsize=16)
plt.xticks(fontsize=14)
# at this point the plot axes is the last (and only) axes of the figure
ax.figure.axes[-1].set_yscale('log')
plt.yticks(tick_locs, labels, fontsize=14)
plt.colorbar()
# after plt.colorbar() the last axes of the figure is the colorbar
colorbar_axes = ax.figure.axes[-1]
colorbar_axes.tick_params(labelsize=14)
colorbar_axes.set_ylabel('Number of Flights', size=14)
plt.show()
Observations:
Observations (largely repeated from prior comments):
Regarding which variables affect Arrival and Departure Delay:
'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', and 'Distance' are only significantly correlated with one another. They should be removed from consideration.
When zero values are removed, 'ArrDelay' and 'DepDelay' are most strongly correlated with 'CarrierDelay', 'WeatherDelay', and 'LateAircraftDelay'
Airports:
Airlines:
General observation:
To focus the multivariate analysis, I am only going to assess Arrival and Departure Delays against the following airports / airlines:
I will also check to see how 'WeatherDelay' affects arrival and departure delays
I plan to start by:
# Flights with a non-zero weather delay, taken from the full dataset, plus
# summary statistics of their arrival delay.
flights_weather = flights_all[flights_all['WeatherDelay'] > 0]
flights_weather.ArrDelay.describe()
flights_weather.ArrDelay.median()

# Flights with a non-zero late-aircraft / carrier delay, taken from the sample.
flights_lateAC = flights[flights['LateAircraftDelay'] > 0]
flights_carrier = flights[flights['CarrierDelay'] > 0]
# Arrival delay (log scale) vs day of year for weather-delayed flights, with
# the point colour encoding the weather delay.
plt.figure(figsize=[12, 6])
points = plt.scatter(data=flights_weather, x='DayOfYear', y='ArrDelay',
                     alpha=1, cmap='Reds', c='WeatherDelay');
plt.xlabel('Day of Year', fontsize=14)
plt.ylabel('Arrival Delay (min)', fontsize=14)
plt.yscale('log')
tick_locs = [30, 100, 300, 1000]
plt.yticks(tick_locs, tick_locs, fontsize=14)
plt.xticks(fontsize=14)
plt.ylim(30, 2000)
plt.colorbar()
# after plt.colorbar() the last axes of the figure is the colorbar
colorbar_axes = points.figure.axes[-1]
colorbar_axes.tick_params(labelsize=14)
colorbar_axes.set_ylabel('Weather Delay (min)', size=14)
title_string = ('Arrival Delay vs Day of Year')
plt.suptitle(title_string, x=0.45, y=0.92, fontsize=14)
plt.show()
Observations:
Function to create colored scatter plots:
def plot_color_scatter(df, varX, varY, varColor, titleX, titleY, titleFig, titleColor,
                       ylim=(100, 2000), tick_locs=(100, 200, 300, 500, 1000, 2000)):
    """Scatter `varY` against `varX` on a log y-axis, coloured by `varColor`.

    Parameters:
        df: dataframe holding the three columns.
        varX, varY, varColor: column names for the x position, y position and
            point colour.
        titleX, titleY: axis labels.
        titleFig: figure title.
        titleColor: label of the colorbar.
        ylim: (lower, upper) limits of the y-axis; defaults to the previously
            hard-coded (100, 2000).
        tick_locs: y tick positions, also used as their labels; defaults to
            the previously hard-coded ticks.

    Returns:
        None. The figure is shown with plt.show().
    """
    plt.figure(figsize=[9, 4])
    points = plt.scatter(data=df, x=varX, y=varY, alpha=1, cmap='Reds', c=varColor)
    plt.xlabel(titleX, fontsize=12)
    plt.ylabel(titleY, fontsize=12)
    # set the log scale first so the custom ticks below are not reset
    plt.yscale('log')
    tick_locs = list(tick_locs)
    plt.yticks(tick_locs, tick_locs, fontsize=12)
    plt.xticks(fontsize=14)
    plt.ylim(ylim[0], ylim[1])
    plt.colorbar()
    # after plt.colorbar() the last axes of the figure is the colorbar
    colorbar_axes = points.figure.axes[-1]
    colorbar_axes.tick_params(labelsize=12)
    colorbar_axes.set_ylabel(titleColor, size=12)
    plt.suptitle(titleFig, x=0.45, y=0.86, fontsize=12)
    plt.show()
The same chart as before, but with a smaller data set and modified y-axis:
# Arrival delay vs day of year for the sampled flights that have a weather
# delay; the column names double as axis / colorbar labels.
varX, varY, varColor = 'DayOfYear', 'ArrDelay', 'WeatherDelay'
has_delay = flights[varColor] > 0
df_to_plot = flights[has_delay]
titleFig = ('Arrival Delay vs Day of Year')
plot_color_scatter(df_to_plot, varX, varY, varColor, varX, varY, titleFig, varColor)
Observations:
Next steps:
Create colored scatter plots for flights with a destination of ORD, SFO, or DFW:
# One coloured scatter per destination airport, restricted to flights in the
# full dataset that have a weather delay.
varX, varY, varColor = 'DayOfYear', 'ArrDelay', 'WeatherDelay'
varFilt = 'Dest'
valuesFilt = ['ORD', 'SFO', 'DFW']
for value in valuesFilt:
    keep = (flights_all[varColor] > 0) & (flights_all[varFilt] == value)
    df_to_plot = flights_all[keep]
    titleFig = (varFilt + ': ' + value)
    plot_color_scatter(df_to_plot, varX, varY, varColor, varX, varY, titleFig, varColor)
Repeat, but this time for Origins and Departure Delays:
# One coloured scatter of departure delay per origin airport, restricted to
# flights in the full dataset that have a weather delay.
varX, varY, varColor = 'DayOfYear', 'DepDelay', 'WeatherDelay'
varFilt = 'Origin'
valuesFilt = ['ORD', 'SFO', 'DFW']
for value in valuesFilt:
    keep = (flights_all[varColor] > 0) & (flights_all[varFilt] == value)
    df_to_plot = flights_all[keep]
    titleFig = (varFilt + ': ' + value)
    plot_color_scatter(df_to_plot, varX, varY, varColor, varX, varY, titleFig, varColor)
Observations:
Now, let's repeat both sets of charts, but this time filtering by some of the top airlines (UA, AA, and US)
# One coloured scatter of arrival delay per airline, restricted to flights in
# the full dataset that have a weather delay.
varX, varY, varColor = 'DayOfYear', 'ArrDelay', 'WeatherDelay'
varFilt = 'UniqueCarrier'
valuesFilt = ['UA', 'AA', 'US']
for value in valuesFilt:
    keep = (flights_all[varColor] > 0) & (flights_all[varFilt] == value)
    df_to_plot = flights_all[keep]
    titleFig = (varFilt + ': ' + value)
    plot_color_scatter(df_to_plot, varX, varY, varColor, varX, varY, titleFig, varColor)
# One coloured scatter of departure delay per airline, restricted to flights
# in the full dataset that have a weather delay.
varX, varY, varColor = 'DayOfYear', 'DepDelay', 'WeatherDelay'
varFilt = 'UniqueCarrier'
valuesFilt = ['UA', 'AA', 'US']
for value in valuesFilt:
    keep = (flights_all[varColor] > 0) & (flights_all[varFilt] == value)
    df_to_plot = flights_all[keep]
    titleFig = (varFilt + ': ' + value)
    plot_color_scatter(df_to_plot, varX, varY, varColor, varX, varY, titleFig, varColor)
Observations:
Next Steps:
Start by writing a function that reduces a dataframe to its median delay values:
def reduce_df_median(data, varGroupBy, varsKeep):
    """Reduce `data` to per-group medians and reshape them into a 2-D grid.

    The dataframe is grouped by `varGroupBy`, the median of each column in
    `varsKeep` is taken per group, and the result is pivoted.

    Parameters:
        data: source dataframe.
        varGroupBy: column to group by.
        varsKeep: three column names, used as follows:
            - 0th element: the values placed in the grid cells
            - 1st element: becomes the row index of the grid
            - 2nd element: becomes the columns of the grid
            The 1st and 2nd columns are cast to integers after taking the
            median, so their per-group medians must be finite.

    Returns:
        A dataframe indexed by varsKeep[1] with one column per value of
        varsKeep[2]; combinations that do not occur are NaN.
    """
    values_col, index_col, columns_col = varsKeep[0], varsKeep[1], varsKeep[2]

    # Select the needed columns BEFORE aggregating: a median over every column
    # fails on text / non-numeric columns in current pandas, and is wasted
    # work for columns that are dropped immediately afterwards.
    data_new = data.groupby(varGroupBy)[list(varsKeep)].median()

    # medians are returned as floats; the grid coordinates must be integers
    data_new[index_col] = data_new[index_col].astype(np.int64)
    data_new[columns_col] = data_new[columns_col].astype(np.int64)

    # reshape the dataframe
    return pd.pivot_table(data_new, values=values_col, index=[index_col], columns=columns_col)
Using the above function, create two "calendar" data frames, one for arrival delays, and one for departure delays
# Build one "calendar" frame per delay type: medians per DayOfYear, pivoted
# with DayOfWeek as the index and Week as the columns.
calendar_keys = ['DayOfWeek', 'Week']
flights_days_Arr = reduce_df_median(flights, 'DayOfYear', ['ArrDelay'] + calendar_keys)
flights_days_Dep = reduce_df_median(flights, 'DayOfYear', ['DepDelay'] + calendar_keys)
flights_days_Arr.head(10)
The subsequent plots will not plot the NaN values, but that is appropriate since they just reflect the particular calendar for 2008
# Overall extremes of the median delays; used as vmin / vmax so the color
# scale can be kept consistent across calendar plots.
arr_column_mins = flights_days_Arr.min()
arr_column_maxs = flights_days_Arr.max()
dep_column_mins = flights_days_Dep.min()
dep_column_maxs = flights_days_Dep.max()

# sorting puts the extreme value first (NaN entries sort last)
minArrDelay_median = arr_column_mins.sort_values().iloc[0]
maxArrDelay_median = arr_column_maxs.sort_values(ascending=False).iloc[0]
minDepDelay_median = dep_column_mins.sort_values().iloc[0]
maxDepDelay_median = dep_column_maxs.sort_values(ascending=False).iloc[0]

for bound in (minArrDelay_median, maxArrDelay_median, minDepDelay_median, maxDepDelay_median):
    print(bound)
Create a "calendar plot" of median arrival delay (color) vs day of the week (y-axis) vs week of the year (x-axis)
# Calendar heatmap of median arrival delay: rows are days of the week,
# columns are weeks, diverging palette centered on zero delay.
title_string = 'Median Arrival Delay\n(All Airports and Carriers)'

plt.figure(figsize=[24, 2.5])
ax = sb.heatmap(
    flights_days_Arr,
    cmap='vlag',
    center=0,
    cbar=True,
    xticklabels=3,  # label every third week
    vmin=minArrDelay_median,
    vmax=maxArrDelay_median,
)

# main axes labels and ticks
plt.xlabel('Week', fontsize=18)
plt.ylabel('Day of Week', fontsize=18)
plt.xticks(rotation=0, fontsize=18)
plt.yticks(rotation=0, ha='right', fontsize=18)

# the colorbar is the last axes added to the figure
cbar_ax = ax.figure.axes[-1]
cbar_ax.tick_params(labelsize=16)
cbar_ax.set_ylabel('Arrival Delay (min)', size=18)

plt.suptitle(title_string, x=0.45, y=1.12, fontsize=18)
plt.show()
Repeat for Departure delays
# Recompute the arrival extremes so this cell can run on its own.
# NOTE(review): the departure heatmap below is drawn with the ARRIVAL range as
# vmin / vmax, which keeps both calendars on one color scale; departure medians
# outside that range would saturate. Confirm this is intended.
minArrDelay_median = flights_days_Arr.min().sort_values().iloc[0]
maxArrDelay_median = flights_days_Arr.max().sort_values(ascending=False).iloc[0]

title_string = 'Median Departure Delay\n(All Airports and Carriers)'

plt.figure(figsize=[24, 2.5])
ax = sb.heatmap(
    flights_days_Dep,
    cmap='vlag',
    center=0,
    cbar=True,
    xticklabels=3,  # label every third week
    vmin=minArrDelay_median,
    vmax=maxArrDelay_median,
)

# main axes labels and ticks
plt.xlabel('Week', fontsize=18)
plt.ylabel('Day of Week', fontsize=18)
plt.xticks(rotation=0, fontsize=18)
plt.yticks(rotation=0, ha='right', fontsize=18)

# the colorbar is the last axes added to the figure
cbar_ax = ax.figure.axes[-1]
cbar_ax.tick_params(labelsize=16)
cbar_ax.set_ylabel('Departure Delay (min)', size=18)

plt.suptitle(title_string, x=0.45, y=1.12, fontsize=18)
plt.show()
Observations:
Next Steps:
A function to create a calendar plot:
def plotCal(df, mask, delayType):
    '''
    Draw a calendar heatmap of median delays for the rows of df selected by mask.

    - df: flights dataframe
    - mask: boolean selector applied as df[mask]
    - delayType: delay column to summarize; 'ArrDelay' gets arrival labels,
      any other value gets departure labels
    The color range is left to seaborn's defaults (centered on 0).
    '''
    # "calendar matrix" of flights with the given mask
    calendar = reduce_df_median(df[mask], 'DayOfYear', [delayType, 'DayOfWeek', 'Week'])

    # pick the colorbar label and figure title for this delay type
    if delayType == 'ArrDelay':
        cbar_label, title_string = 'Arrival Delay (min)', 'Median Arrival Delay'
    else:
        cbar_label, title_string = 'Departure Delay (min)', 'Median Departure Delay'

    plt.figure(figsize=[24, 2.5])
    ax = sb.heatmap(calendar, cmap='vlag', center=0, cbar=True, xticklabels=3)

    plt.xlabel('Week', fontsize=18)
    plt.ylabel('Day of Week', fontsize=18)
    plt.xticks(rotation=0, fontsize=18)
    plt.yticks(rotation=0, ha='right', fontsize=18)

    # the colorbar is the last axes added to the figure
    cbar_ax = ax.figure.axes[-1]
    cbar_ax.tick_params(labelsize=14)
    cbar_ax.set_ylabel(cbar_label, size=18)

    plt.suptitle(title_string, x=0.45, y=1.04, fontsize=18)
    plt.show()
Function to create a calendar plot with a specific vmin and vmax:
def plotCal_specific_cBar(df, mask, delayType, cBarMin, cBarMax):
    '''
    Draw a calendar heatmap of median delays with an explicit color range.

    - df: flights dataframe
    - mask: boolean selector applied as df[mask]
    - delayType: delay column to summarize; 'ArrDelay' gets arrival labels,
      any other value gets departure labels
    - cBarMin, cBarMax: passed to the heatmap as vmin / vmax
    '''
    # "calendar matrix" of flights with the given mask
    calendar = reduce_df_median(df[mask], 'DayOfYear', [delayType, 'DayOfWeek', 'Week'])

    # pick the colorbar label and figure title for this delay type
    if delayType == 'ArrDelay':
        cbar_label, title_string = 'Arrival Delay (min)', 'Median Arrival Delay'
    else:
        cbar_label, title_string = 'Departure Delay (min)', 'Median Departure Delay'

    plt.figure(figsize=[24, 2.5])
    ax = sb.heatmap(
        calendar,
        cmap='vlag',
        center=0,
        cbar=True,
        xticklabels=3,
        vmin=cBarMin,
        vmax=cBarMax,
    )

    plt.xlabel('Week', fontsize=18)
    plt.ylabel('Day of Week', fontsize=18)
    plt.xticks(rotation=0, fontsize=18)
    plt.yticks(rotation=0, ha='right', fontsize=18)

    # the colorbar is the last axes added to the figure
    cbar_ax = ax.figure.axes[-1]
    cbar_ax.tick_params(labelsize=14)
    cbar_ax.set_ylabel(cbar_label, size=18)

    plt.suptitle(title_string, x=0.45, y=1.04, fontsize=18)
    plt.show()
Function to plot Arrival and Departure delays for a given airport:
def plotCal_Airport(df, airport):
    '''
    Plot arrival and departure calendar heatmaps for one airport on a shared color scale.

    - df: flights dataframe with 'Dest', 'Origin', 'ArrDelay', 'DepDelay',
      'DayOfYear', 'DayOfWeek' and 'Week' columns
    - airport: airport code matched against 'Dest' (arrivals) and 'Origin' (departures)
    '''
    # Build the masks from df itself rather than the global flights_all, so the
    # boolean index always lines up with the frame that is actually filtered.
    arrivals_mask = df['Dest'] == airport
    departures_mask = df['Origin'] == airport

    # median arrival delays for flights with <airport> as a destination
    df_reduced_arr = reduce_df_median(df[arrivals_mask], 'DayOfYear', ['ArrDelay', 'DayOfWeek', 'Week'])
    # median departure delays for flights with <airport> as an origin
    df_reduced_dep = reduce_df_median(df[departures_mask], 'DayOfYear', ['DepDelay', 'DayOfWeek', 'Week'])

    # one color range covering both calendars, so the two plots are comparable
    cBarMin = min(df_reduced_arr.min().min(), df_reduced_dep.min().min())
    cBarMax = max(df_reduced_arr.max().max(), df_reduced_dep.max().max())

    plotCal_specific_cBar(df, arrivals_mask, 'ArrDelay', cBarMin, cBarMax)
    plotCal_specific_cBar(df, departures_mask, 'DepDelay', cBarMin, cBarMax)
Function to plot Arrival and Departure delays for a given airline:
def plotCal_Airline(df, airline):
    '''
    Plot arrival and departure calendar heatmaps for one airline on a shared color scale.

    - df: flights dataframe with 'UniqueCarrier', 'ArrDelay', 'DepDelay',
      'DayOfYear', 'DayOfWeek' and 'Week' columns
    - airline: carrier code matched against 'UniqueCarrier'
    '''
    # Build the mask from df itself rather than the global flights_all, so the
    # boolean index always lines up with the frame that is actually filtered.
    carrier_mask = df['UniqueCarrier'] == airline
    df_new = df[carrier_mask]

    # median arrival and departure delays for flights with the given airline
    df_reduced_arr = reduce_df_median(df_new, 'DayOfYear', ['ArrDelay', 'DayOfWeek', 'Week'])
    df_reduced_dep = reduce_df_median(df_new, 'DayOfYear', ['DepDelay', 'DayOfWeek', 'Week'])

    # one color range covering both calendars, so the two plots are comparable
    cBarMin = min(df_reduced_arr.min().min(), df_reduced_dep.min().min())
    cBarMax = max(df_reduced_arr.max().max(), df_reduced_dep.max().max())

    plotCal_specific_cBar(df, carrier_mask, 'ArrDelay', cBarMin, cBarMax)
    plotCal_specific_cBar(df, carrier_mask, 'DepDelay', cBarMin, cBarMax)
Arrival and Departure delays for 'ORD':
# calendar heatmaps for ORD: arrivals (as destination) and departures (as origin)
plotCal_Airport(df=flights_all, airport='ORD')
Arrival and Departure delays for 'SFO':
# calendar heatmaps for SFO: arrivals (as destination) and departures (as origin)
plotCal_Airport(df=flights_all, airport='SFO')
Arrival and Departure delays for 'DFW':
# calendar heatmaps for DFW: arrivals (as destination) and departures (as origin)
plotCal_Airport(df=flights_all, airport='DFW')
Observations:
Arrival and Departure delays for 'UA':
# arrival and departure calendar heatmaps for carrier UA
plotCal_Airline(df=flights_all, airline='UA')
Arrival and Departure delays for 'AA':
# arrival and departure calendar heatmaps for carrier AA
plotCal_Airline(df=flights_all, airline='AA')
Arrival and Departure delays for 'US':
# arrival and departure calendar heatmaps for carrier US
plotCal_Airline(df=flights_all, airline='US')
Observations:
Observations (largely repeated from prior comments):
As previously suspected, weather does seem to be a notable swing factor affecting the length of arrival and departure delays.
There also appear to be unique cycles wherein a given airport and/or airline is likely to have longer delays.
Weather and Time of year:
Some airlines are less susceptible to weather-related delays
If you wanted to have little to no delay in 2008, some good ways to attempt this would have been:
Overall: time of year, weather, airport, and airline are significant factors that affect the likelihood of having a long or a short delay
Week 14 (March 31 - April 6) was interesting, in that there was a spike of delays on the Monday and Friday of that week. I suspect this may have been the peak spring break week in 2008.
ORD had a string of arrival delays on Mondays in the beginning of the year. I suspect this may have been a confluence of business travel and winter weather
UA had a period in week 23 (June 2 - June 8) wherein the arrival delays rivaled those seen during the holidays.
US consistently had shorter delays than the other two airlines and often arrived early!
Some of the links served as style guides on how to perform a given operation. However, most of them have just been copied below because they seemed interesting and/or applicable to the types of analysis and plots herein.
Here is a reference for setting matplotlib title properties
Reference for adding text to matplotlib plots
Reference for adding a figure title and subplot titles
Types of charts - graphic scheme
A compilation of the "Top 50 most useful" matplotlib plots in data analysis and visualization
Python Graph Gallery, a site with hundreds of different chart types (and code samples) that can be created using Python
List of Seaborn color palettes
Matplotlib documentation on choosing colormaps
Seaborn heatmap documentation
Single Page "cheat sheet" for plotting with Seaborn
StackOverflow page describing how to fix irregular spacing of xticklabels that have been rotated
Matplotlib - colorbar documentation
Matplotlib documentation: some annotated heatmap examples
StackOverflow page describing how to modify colorbar properties - references plots that are plotted using the 'Fig, ax' coding style
StackOverflow example of how to append the same string to a list of strings in Python:
[s + mystring for s in mylist]
StackOverflow page on how to convert multiple columns into one datetime column in pandas
Interesting StackOverflow example of creating a calendar heatmap without Pandas
Calmap library for plotting Calendar heatmaps from Pandas time series data
StackOverflow page on how to pivot DataFrames
StackOverflow page - Performing log-scale axis transformations and resizing the colorbar ticks on a mesh grid plot (similar to a heatmap)
Here is a so-so example of stacking multiple heatmaps on top of one another
Example of a Facetgrid with multiple scatter plots
Example of how to add a line to a second axis
Example of how to create a log-linear 2d histogram